Conceptual


Q1

(a)

\(a_1=\beta_0,b_1=\beta_1,c_1=\beta_2,d_1=\beta_3\)

(b)

\(a_2=\beta_0-\beta_4\xi^3,b_2=\beta_1+3\beta_4\xi^2,c_2=\beta_2-3\beta_4\xi,d_2=\beta_3+\beta_4\)

(c)

\[ \begin{aligned} f_2(\xi)&=\beta_0-\beta_4\xi^3+\beta_1\xi+3\beta_4\xi^3+\beta_2\xi^2-3\beta_4\xi^3+\beta_3\xi^3+\beta_4\xi^3\\ &=\beta_0+\beta_1\xi+\beta_2\xi^2+\beta_3\xi^3=f_1(\xi)\\ \end{aligned} \]

(d)

\[ \begin{aligned} f^\prime_2(x)&=\beta_1+3\beta_4\xi^2+(2\beta_2-6\beta_4\xi)x+(3\beta_4+3\beta_3)x^2\\ f^\prime_1(x)&=\beta_1+2\beta_2x+3\beta_3x^2 \end{aligned}\\ \begin{aligned} \rightarrow f^\prime_2(\xi)&=\beta_1+3\beta_4\xi^2+2\beta_2\xi-6\beta_4\xi^2+3\beta_4\xi^2+3\beta_3\xi^2\\ &=\beta_1+2\beta_2\xi+3\beta_3\xi^2=f^\prime_1(\xi) \end{aligned} \]

(e)

\[ \begin{aligned} f^{\prime\prime}_2(x)&=2\beta_2-6\beta_4\xi+(6\beta_4+6\beta_3)x\\ f^{\prime\prime}_1(x)&=2\beta_2+6\beta_3x \end{aligned}\\ \begin{aligned} \rightarrow f^{\prime\prime}_2(\xi)&=2\beta_2-6\beta_4\xi+6\beta_4\xi+6\beta_3\xi\\ &=2\beta_2+6\beta_3\xi=f^{\prime\prime}_1(\xi) \end{aligned} \]

Q2
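
All parts refer to the same criterion; recall from the exercise that

\[ \hat g=\arg\min_g\left(\sum_{i=1}^n\big(y_i-g(x_i)\big)^2+\lambda\int\big[g^{(m)}(x)\big]^2\,dx\right) \]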

(a)

When \(\lambda\) is very large, the penalty term dominates the criterion, so the minimizer must satisfy \(g^{(0)}=g=0\). Hence \(\hat g=0\), the zero function.

(b)

When \(\lambda\) is very large, \(g^{(1)}=0\) forces \(g\) to be a constant function; the constant minimizing the RSS is the sample mean, so \(\hat g(x)=\bar y\).

(c)

When \(\lambda\) is very large, \(g^{(2)}=0\) forces \(g\) to be a linear function; \(\hat g\) is the least squares line.

(d)

When \(\lambda\) is very large, \(g^{(3)}=0\) forces \(g\) to be a quadratic function; \(\hat g\) is the least squares quadratic fit.

(e)

When \(\lambda=0\), the penalty term vanishes and \(\hat g\) simply minimizes the training RSS, so \(\hat g\) can be any function that interpolates (passes exactly through) every training point.

Q3

x1=seq(-2,1,0.01)                 # region where the truncated quadratic term is zero
y1=1+x1                           # f(x) = 1 + x for x < 1
plot(x1,y1,xlim=c(-2,2),ylim=c(-5,5))
x2=seq(1,2,0.01)
y2=1+x2-2*(x2-1)^2                # the (x-1)^2 term is active for x >= 1
points(x2,y2)

The curve is linear (slope 1) for \(x<1\) and quadratic for \(x\ge1\), with one knot at \(x=1\).
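
Explicitly, the fitted curve is

\[ f(x)=\begin{cases}1+x, & -2\le x<1\\ 1+x-2(x-1)^2, & 1\le x\le 2\end{cases} \]

so both the value and the slope are continuous at the knot, while the curvature changes there.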

Q4

x1=seq(-2,0,0.01)
y1=rep(1,length(x1))              # b1 = b2 = 0, so f(x) = beta0 = 1
plot(x1,y1,xlim=c(-2,2),ylim=c(-3,3))
x2=seq(0,1,0.01)
y2=rep(2,length(x2))              # b1 = 1, so f(x) = 1 + 1 = 2
points(x2,y2)
x3=seq(1,2,0.01)
y3=3-x3                           # b1 = 1 - (x-1), so f(x) = 3 - x
points(x3,y3)

#Another way to plot using ggplot
# x.1 <- seq(-6, 0, 0.1)  # [-6,0)
# x.2 <- seq(0, 1, 0.1)   # [0,1)
# x.3 <- seq(1, 2, 0.1)   # [1,2]
# x.4 <- seq(2, 3, 0.1)   # (2,3)
# x.5 <- seq(3, 4, 0.1)   # [3,4]
# x.6 <- seq(4, 5, 0.1)   # (4,5]
# x.7 <- seq(5, 6, 0.1)   # (5,6)
# y.1 <- rep(1, length(x.1)) 
# y.2 <- rep(2, length(x.2))
# y.3 <- 3 - x.3
# y.4 <- rep(1, length(x.4))
# y.5 <- 3*x.5 - 8
# y.6 <- rep(4, length(x.6))
# y.7 <- rep(1, length(x.7))
# df <- data.frame(X = c(x.1,x.2,x.3,x.4,x.5,x.6,x.7),
#                  Y = c(y.1,y.2,y.3,y.4,y.5,y.6,y.7))
# p <- ggplot(df, aes(x=X,y=Y)) + geom_line(size=1.5)
# rect <- data.frame(xmin=-2, xmax=2, ymin=-Inf, ymax=Inf)
# p + geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax),
#               fill="grey20",
#               alpha=0.4,
#               inherit.aes = FALSE)

There are two knots in the plotted range, at \(x=0\) and \(x=1\).
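
On the plotted range the curve is

\[ f(x)=\begin{cases}1, & -2\le x<0\\ 2, & 0\le x\le 1\\ 3-x, & 1<x\le 2\end{cases} \]

with a jump discontinuity at \(x=0\); at \(x=1\) the curve is continuous but changes from constant to linear.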

Q5

(a)

\(\hat g_2\). The penalty on \(g^{(3)}\) is stricter than the penalty on \(g^{(4)}\), so \(\hat g_2\) is the more flexible fit and will track the training data more closely, giving the smaller training RSS.

(b)

Either could have the smaller test RSS; it depends on the true form of \(f\). If the extra flexibility of \(\hat g_2\) captures real structure it will win; if it merely overfits, \(\hat g_1\) will win.

(c)

When \(\lambda=0\) the penalty terms drop out, so both problems minimize the same RSS: \(\hat g_1=\hat g_2\), and the training and test RSS are identical.


Applied


Q6

(a)

library(ISLR)
library(boot)
set.seed(1)
cv.error.10=rep(0,10)
for(i in 1:10){
  poly.fit=glm(wage~poly(age,i),data=Wage)
  cv.error.10[i]=cv.glm(Wage,poly.fit,K=10)$delta[1]   # 10-fold CV error
}
degree=which.min(cv.error.10)
poly.fit=lm(wage~poly(age,degree),data=Wage)
fit.1=lm(wage~poly(age,1),data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,poly.fit,fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, degree)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.8118 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.9038  0.001666 ** 
## 4   2990 4756703  6     20971   2.1970  0.040533 *  
## 5   2994 4770322 -4    -13619   2.1401  0.073355 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lim=range(Wage$age)
grid=seq(lim[1],lim[2])
preds=predict(poly.fit,newdata=list(age=grid),se=TRUE)
plot(Wage$age,Wage$wage)
lines(grid,preds$fit,lwd=2,col='red',xlim=lim)

Cross-validation selects degree 9 (Model 4 above has 2990 residual df, i.e. a degree-9 polynomial); the ANOVA indicates that its gain over the cubic is only marginally significant (p ≈ 0.04), so a cubic would be a defensible simpler choice.

(b)

set.seed(1)
cv.error.10=rep(0,10)

for(i in 1:10){
  #Cut data first
 Wage$age.cut <- cut(Wage$age,i+1)
 glm.fit <- glm(wage~age.cut, data=Wage)
  cv.error.10[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
#step.fit=glm(wage~cut(age,i+1),data=Wage)    
#cv.error.10[i]=cv.glm(Wage,step.fit,K=10)$delta[1]
}
number=which.min(cv.error.10)

plot(2:11,cv.error.10,type='b')

step.fit=glm(wage~cut(age,8),data=Wage)
preds=predict(step.fit,newdata=list(age=grid),se=TRUE)
se.bands=preds$fit + cbind(2*preds$se.fit, -2*preds$se.fit)
plot(Wage$age,Wage$wage)
lines(grid, preds$fit, lwd=2, col="blue")
matlines(grid, se.bands, lwd=1, col="blue", lty=3)

Cross-validation selects 8 intervals.

Q7

par(mfrow=c(2,1))
plot(Wage$maritl,Wage$wage)
plot(Wage$jobclass,Wage$wage)

Both maritl and jobclass are categorical variables. For jobclass, Information jobs appear to pay more than Industrial jobs; for maritl, married workers appear to earn the highest wages.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
gam.fit1 <- gam(wage~ns(age,4), data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
gam.fit2.1 <- gam(wage~ns(age,4)+maritl, data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
gam.fit2.2 <- gam(wage~ns(age,4)+jobclass, data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
gam.fit3 <- gam(wage~ns(age,4)+maritl+jobclass, data=Wage)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
anova(gam.fit1, gam.fit2.1, gam.fit3)
## Analysis of Deviance Table
## 
## Model 1: wage ~ ns(age, 4)
## Model 2: wage ~ ns(age, 4) + maritl
## Model 3: wage ~ ns(age, 4) + maritl + jobclass
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      2995    4773109                          
## 2      2991    4650697  4   122412 < 2.2e-16 ***
## 3      2990    4479913  1   170784 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam.fit3, se=TRUE, col="blue")

Q8

pairs(Auto)

It can be seen that `displacement`, `horsepower`, `weight` and `acceleration` may have nonlinear relationships with `mpg`.

gam.fit=gam(mpg~s(displacement,4)+s(horsepower,5)+year+cylinders,data=Auto)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
par(mfrow=c(1,4))
plot(gam.fit, se=TRUE, col="blue")
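
As a rough check (a sketch, not part of the original output), the smooth terms can be compared against a purely linear fit with an F test; a small p-value would support keeping the nonlinear terms:

# hypothetical follow-up: nested linear fit vs. the spline GAM above
gam.lin=gam(mpg~displacement+horsepower+year+cylinders,data=Auto)
anova(gam.lin,gam.fit,test='F')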

Q9

(a)

library(MASS)
fit=lm(nox~poly(dis,3),Boston)
summary(fit)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
lim=range(Boston$dis)
grid=seq(lim[1],lim[2],0.1)
preds=predict(fit,newdata=list(dis=grid),se=TRUE)
se.bands <- preds$fit + cbind(2*preds$se.fit, -2*preds$se.fit)
plot(Boston$dis,Boston$nox)
lines(grid,preds$fit,lwd=2,col='red',xlim=lim)
matlines(grid,se.bands,col='red')

The intercept estimate is 0.554695, and the coefficients on the degree-1, 2 and 3 terms are -2.003096, 0.856330 and -0.318049 respectively; all are highly significant. Note that `poly()` uses an orthogonal polynomial basis by default, so these are not coefficients on raw powers of `dis`.

(b)

error=rep(0,10)
for(i in 1:10){
  fit=lm(nox~poly(dis,i),Boston)
  preds=predict(fit,Boston)
  error[i]=sum((preds-Boston$nox)^2)  # training RSS; equivalently sum(fit$residuals^2)
}
error
##  [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484
##  [8] 1.835630 1.833331 1.832171
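
The training RSS decreases monotonically with the degree, as it must when the model is made more flexible.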

(c)

set.seed(1)
cv.error.10=rep(0,10)
for(i in 1:10){
poly.fit=glm(nox~poly(dis,i),data=Boston)
cv.error.10[i]=cv.glm(Boston,poly.fit,K=10)$delta[1]
}
which.min(cv.error.10)
## [1] 4

Cross-validation chooses degree 4.

(d)

library(splines)
fit=lm(nox~bs(dis,df=4),data=Boston)
attr(bs(Boston$dis,df=4),'knots')
##     50% 
## 3.20745
pred=predict(fit,newdata=list(dis=grid))
plot(Boston$dis,Boston$nox)
lines(grid,pred,lwd=2,col='red')
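
`bs()` with `df=4` places its knots automatically at uniform quantiles of the data; with a cubic basis this leaves a single interior knot, at the median of `dis` (≈ 3.21), as the `attr()` output above shows.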

(e)

par(mfrow=c(3,3))                 # nine fits (df = 3 to 11), one panel each
error=rep(0,9)
for(i in 3:11){
  fit=lm(nox~bs(dis,df=i),data=Boston)
  pred=predict(fit,newdata=list(dis=grid))
  plot(Boston$dis,Boston$nox,main=paste('df =',i))
  lines(grid,pred,lwd=2,col='red')
  error[i-2]=sum(fit$residuals^2) # training RSS
}

which.min(error)
## [1] 8

The smallest training RSS is obtained with 10 degrees of freedom (`error[8]` corresponds to df = 10).

(f)

set.seed(1)
cv.error.10=rep(0,7)
for(i in 4:10){
poly.fit=glm(nox~bs(dis,df=i),data=Boston)
cv.error.10[i-3]=cv.glm(Boston,poly.fit,K=10)$delta[1]
}
## Warning in bs(dis, degree = 3L, knots = c(`50%` = 3.23925), Boundary.knots
## = c(1.137, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## (the same warning is repeated for each df and each CV fold; duplicates omitted)
which.min(cv.error.10)
## [1] 4

Using cross-validation, the optimal degrees of freedom is 7: `which.min` returns index 4, and index \(i\) of `cv.error.10` corresponds to df \(=i+3\).

Q10

(a)

library(leaps)
train_index=sample(nrow(College),7*nrow(College)/10)  # no seed is set, so the 70/30 split varies from run to run
train=College[train_index,]
test=College[-train_index,]

predict.regsubsets <- function(object, newdata, id, ...){
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id=id)
  xvars <- names(coefi)
  mat[,xvars]%*%coefi
}

# forward selection
reg.fwd=regsubsets(Outstate~.,data=train,method='forward',nvmax=17)
regsum=summary(reg.fwd)

err.fwd <- rep(0, ncol(College)-1)
for(i in 1:(ncol(College)-1)) {
  pred.fwd <- predict(reg.fwd, test, id=i)
  err.fwd[i] <- mean((test$Outstate - pred.fwd)^2)
}

par(mfrow=c(1,2))
plot(err.fwd,main="Test MSE", xlab="Number of Predictors")
min.mse <- which.min(err.fwd)  
points(min.mse, err.fwd[min.mse], col="red", pch=4, lwd=5)
plot(regsum$adjr2, type="b", main="Adjusted R^2", xlab="Number of Predictors")
max.adjr2 <- which.max(regsum$adjr2)  
points(max.adjr2, regsum$adjr2[max.adjr2], col="red", pch=4, lwd=5)
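
The test MSE levels off at around six predictors, so a six-variable model is a reasonable choice; part (b) fits a GAM using six predictors selected on this basis.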

(b)

gam.fit <- gam(Outstate ~ 
                 Private +   
                 s(Room.Board,3) + 
                 s(Terminal,3) + 
                 s(perc.alumni,3) +
               s(Expend,3) + 
                 s(Grad.Rate,3), 
               data=College)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
par(mfrow=c(2,3))
plot(gam.fit, se=TRUE, col="blue")

(c)

pred <- predict(gam.fit, test)
(mse.error <- mean((test$Outstate - pred)^2))
## [1] 2940903
err.fwd[6]
## [1] 3757382
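
The GAM's test MSE (about 2.94 million) is lower than that of the six-predictor forward-selection model (about 3.76 million). Note, however, that gam.fit above was trained on the full College data rather than on train alone, so this comparison flatters the GAM somewhat.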

(d)

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 3) + s(Terminal, 
##     3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3), 
##     data = College)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7110.16 -1137.02    50.44  1285.38  8278.86 
## 
## (Dispersion Parameter for gaussian family taken to be 3520187)
## 
##     Null Deviance: 12559297426 on 776 degrees of freedom
## Residual Deviance: 2675342725 on 760.0001 degrees of freedom
## AIC: 13936.36 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 3366732308 3366732308 956.407 < 2.2e-16 ***
## s(Room.Board, 3)    1 2549088628 2549088628 724.134 < 2.2e-16 ***
## s(Terminal, 3)      1  802254341  802254341 227.901 < 2.2e-16 ***
## s(perc.alumni, 3)   1  525154274  525154274 149.184 < 2.2e-16 ***
## s(Expend, 3)        1 1022010841 1022010841 290.329 < 2.2e-16 ***
## s(Grad.Rate, 3)     1  151344060  151344060  42.993 1.014e-10 ***
## Residuals         760 2675342725    3520187                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F   Pr(F)    
## (Intercept)                                 
## Private                                     
## s(Room.Board, 3)        2  2.591 0.07557 .  
## s(Terminal, 3)          2  2.558 0.07815 .  
## s(perc.alumni, 3)       2  0.835 0.43446    
## s(Expend, 3)            2 56.179 < 2e-16 ***
## s(Grad.Rate, 3)         2  3.363 0.03515 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The "Anova for Parametric Effects" table tests the linear component of each term, while the "Anova for Nonparametric Effects" table tests for nonlinearity beyond that linear component.

Strong evidence of non-linear effects for Expend, some evidence for Room.Board, Terminal and Grad.Rate, and no evidence for perc.alumni.

Q11

(a)

set.seed(1)                 # seed added for reproducibility
epi=rnorm(100)              # noise term (also reused in Q12 below)
x1=rnorm(100)
x2=rnorm(100)
y=1+2*x1+3*x2+epi           # true coefficients: beta0=1, beta1=2, beta2=3

(b)

Let \(\hat\beta_1=5\) (any starting value will do).

(c)

beta1=5
a=y-beta1*x1                # partial residual with beta1 held fixed
beta2=lm(a~x2)$coef[2]

(d)

a=y-beta2*x2                # partial residual with beta2 (from part (c)) held fixed
beta1=lm(a~x1)$coef[2]

(e)

beta1=6                     # an arbitrary starting value
beta_0=rep(0,1000)
beta_1=rep(0,1000)
beta_2=rep(0,1000)
for(i in 1:1000){
  # one backfitting sweep: update beta2 with beta1 fixed, then beta1 with beta2 fixed
  a=y-beta1*x1
  beta2=lm(a~x2)$coef[2]
  a=y-beta2*x2
  beta1=lm(a~x1)$coef[2]
  beta_0[i]=lm(a~x1)$coef[1]   # intercept estimate from the same regression
  beta_1[i]=beta1
  beta_2[i]=beta2
}
x=seq(1000)
par(mfrow=c(3,1))
plot(x,beta_0,ylim=c(0,7))
plot(x,beta_1,col='red',ylim=c(0,7))
plot(x,beta_2,col='blue',ylim=c(0,7))

(f)

fit.lm=lm(y~x1+x2)
summary(fit.lm)
par(mfrow=c(3,1))
plot(x,beta_0,ylim=c(0,7))
abline(h=coef(fit.lm)[1], lty="dashed", lwd=3, col="brown")
plot(x,beta_1,col='red',ylim=c(0,7))
abline(h=coef(fit.lm)[2], lty="dashed", lwd=3, col="darkgreen")
plot(x,beta_2,col='blue',ylim=c(0,7))
abline(h=coef(fit.lm)[3], lty="dashed", lwd=3, col="darkblue")

(g)

In this dataset the estimates stabilize almost immediately: three iterations are already enough for the backfitting coefficients to coincide with the multiple regression estimates (the dashed lines above).
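
A quick numerical check (a sketch; the exact values depend on the random seed and are not shown here):

# compare the backfitting estimates after 3 iterations with lm()
cbind(backfit=c(beta_0[3],beta_1[3],beta_2[3]),
      lm=coef(fit.lm))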

Q12

n=100
p=100
data=matrix(0, ncol=p, nrow=n)   # columns are filled in below
set.seed(1)
y=rep(0,n)
for(i in 1:p){
  data[,i]=rnorm(n)
  y=y+i*data[,i]                 # true coefficient on X_i is i
}
y=y+epi                          # reuses the noise vector from Q11 (a)
data=data.frame(data)
mse.error <- rep(0, 2000)
beta=data.frame(sample(100))     # random starting values for the 100 coefficients
##Backfitting
for(i in 1:2000){
  for(j in 1:100){
    k=100-j+1                    # update the coefficients in reverse order
    a=y-as.matrix(data[,-k])%*%as.matrix(beta[-k,])
    beta[k,]=lm(a~data[,k])$coef[2]
  }
  mse.error[i] <- mean((y - as.matrix(data)%*%as.matrix(beta))^2)
}
plot(900:1000,mse.error[900:1000])

The MSE is essentially flat over this window, so the backfitting estimates converged long before the 2000th iteration.